Loading dataset
Telecom_Data <- data.frame(read.csv("Telecom Data.csv"))
ncol(Telecom_Data)
## [1] 58
nrow(Telecom_Data)
## [1] 51047
There are total 58 Columns and 51,047 Rows
Converting few columns to factor
Telecom_Data$Churn <- factor(Telecom_Data$Churn)
Telecom_Data$CreditRating <- factor(Telecom_Data$CreditRating)
Telecom_Data$Occupation <- factor(Telecom_Data$Occupation)
Let’s check for null values
library(dplyr)
library(tidyr)
## Checking the null values in the dataset
#summary(Telecom_Data)
#is.null(Telecom_Data)
null_values<-sapply(Telecom_Data, function(x) sum(is.na(x)))
null_values
## CustomerID Churn MonthlyRevenue
## 0 0 156
## MonthlyMinutes TotalRecurringCharge DirectorAssistedCalls
## 156 156 156
## OverageMinutes RoamingCalls PercChangeMinutes
## 156 156 367
## PercChangeRevenues DroppedCalls BlockedCalls
## 367 0 0
## UnansweredCalls CustomerCareCalls ThreewayCalls
## 0 0 0
## ReceivedCalls OutboundCalls InboundCalls
## 0 0 0
## PeakCallsInOut OffPeakCallsInOut DroppedBlockedCalls
## 0 0 0
## CallForwardingCalls CallWaitingCalls MonthsInService
## 0 0 0
## UniqueSubs ActiveSubs ServiceArea
## 0 0 0
## Handsets HandsetModels CurrentEquipmentDays
## 1 1 1
## AgeHH1 AgeHH2 ChildrenInHH
## 909 909 0
## HandsetRefurbished HandsetWebCapable TruckOwner
## 0 0 0
## RVOwner Homeownership BuysViaMailOrder
## 0 0 0
## RespondsToMailOffers OptOutMailings NonUSTravel
## 0 0 0
## OwnsComputer HasCreditCard RetentionCalls
## 0 0 0
## RetentionOffersAccepted NewCellphoneUser NotNewCellphoneUser
## 0 0 0
## ReferralsMadeBySubscriber IncomeGroup OwnsMotorcycle
## 0 0 0
## AdjustmentsToCreditRating HandsetPrice MadeCallToRetentionTeam
## 0 0 0
## CreditRating PrizmCode Occupation
## 0 0 0
## MaritalStatus
## 0
Few columns have null values but the count is less,
Lets Create New Variables which will help in analysis
##Creation of new variables for our analysis
Telecom_Data$perc_recurrent_charge <- (Telecom_Data$TotalRecurringCharge /Telecom_Data$MonthlyRevenue) * 100
Telecom_Data$perc_overage_minute <- (Telecom_Data$OverageMinutes / Telecom_Data$MonthlyMinutes) * 100
str(Telecom_Data)
## 'data.frame': 51047 obs. of 60 variables:
## $ CustomerID : int 3000002 3000010 3000014 3000022 3000026 3000030 3000038 3000042 3000046 3000050 ...
## $ Churn : Factor w/ 2 levels "No","Yes": 2 2 1 1 2 1 1 1 1 1 ...
## $ MonthlyRevenue : num 24 17 38 82.3 17.1 ...
## $ MonthlyMinutes : int 219 10 8 1312 0 682 26 98 24 1056 ...
## $ TotalRecurringCharge : int 22 17 38 75 17 52 30 66 35 75 ...
## $ DirectorAssistedCalls : num 0.25 0 0 1.24 0 0.25 0.25 2.48 0 0 ...
## $ OverageMinutes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ RoamingCalls : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PercChangeMinutes : int -157 -4 -2 157 0 148 60 24 20 43 ...
## $ PercChangeRevenues : num -19 0 0 8.1 -0.2 -3.1 4 6.8 -0.3 2.4 ...
## $ DroppedCalls : num 0.7 0.3 0 52 0 9 0 0 0 0 ...
## $ BlockedCalls : num 0.7 0 0 7.7 0 1.7 1 0.3 0 0 ...
## $ UnansweredCalls : num 6.3 2.7 0 76 0 13 2.3 4 1 0 ...
## $ CustomerCareCalls : num 0 0 0 4.3 0 0.7 0 4 0 0 ...
## $ ThreewayCalls : num 0 0 0 1.3 0 0 0 0 0 0 ...
## $ ReceivedCalls : num 97.2 0 0.4 200.3 0 ...
## $ OutboundCalls : num 0 0 0.3 370.3 0 ...
## $ InboundCalls : num 0 0 0 147 0 0 0 0 1.7 0 ...
## $ PeakCallsInOut : num 58 5 1.3 555.7 0 ...
## $ OffPeakCallsInOut : num 24 1 3.7 303.7 0 ...
## $ DroppedBlockedCalls : num 1.3 0.3 0 59.7 0 10.7 1 0.3 0 0 ...
## $ CallForwardingCalls : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CallWaitingCalls : num 0.3 0 0 22.7 0 0.7 0 0 0 0 ...
## $ MonthsInService : int 61 58 60 59 53 53 57 59 53 55 ...
## $ UniqueSubs : int 2 1 1 2 2 1 2 2 3 1 ...
## $ ActiveSubs : int 1 1 1 2 2 1 2 2 3 1 ...
## $ ServiceArea : chr "SEAPOR503" "PITHOM412" "MILMIL414" "PITHOM412" ...
## $ Handsets : int 2 2 1 9 4 3 2 3 4 9 ...
## $ HandsetModels : int 2 1 1 4 3 2 2 3 3 5 ...
## $ CurrentEquipmentDays : int 361 1504 1812 458 852 231 601 464 544 388 ...
## $ AgeHH1 : int 62 40 26 30 46 28 52 46 36 46 ...
## $ AgeHH2 : int 0 42 26 0 54 0 58 46 34 68 ...
## $ ChildrenInHH : chr "No" "Yes" "Yes" "No" ...
## $ HandsetRefurbished : chr "No" "No" "No" "No" ...
## $ HandsetWebCapable : chr "Yes" "No" "No" "Yes" ...
## $ TruckOwner : chr "No" "No" "No" "No" ...
## $ RVOwner : chr "No" "No" "No" "No" ...
## $ Homeownership : chr "Known" "Known" "Unknown" "Known" ...
## $ BuysViaMailOrder : chr "Yes" "Yes" "No" "Yes" ...
## $ RespondsToMailOffers : chr "Yes" "Yes" "No" "Yes" ...
## $ OptOutMailings : chr "No" "No" "No" "No" ...
## $ NonUSTravel : chr "No" "No" "No" "No" ...
## $ OwnsComputer : chr "Yes" "Yes" "No" "No" ...
## $ HasCreditCard : chr "Yes" "Yes" "Yes" "Yes" ...
## $ RetentionCalls : int 1 0 0 0 0 0 0 0 0 0 ...
## $ RetentionOffersAccepted : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NewCellphoneUser : chr "No" "Yes" "Yes" "Yes" ...
## $ NotNewCellphoneUser : chr "No" "No" "No" "No" ...
## $ ReferralsMadeBySubscriber: int 0 0 0 0 0 0 0 0 0 0 ...
## $ IncomeGroup : int 4 5 6 6 9 1 9 6 9 5 ...
## $ OwnsMotorcycle : chr "No" "No" "No" "No" ...
## $ AdjustmentsToCreditRating: int 0 0 0 0 1 1 1 0 0 1 ...
## $ HandsetPrice : chr "30" "30" "Unknown" "10" ...
## $ MadeCallToRetentionTeam : chr "Yes" "No" "No" "No" ...
## $ CreditRating : Factor w/ 7 levels "1-Highest","2-High",..: 1 4 3 4 1 3 1 1 1 3 ...
## $ PrizmCode : chr "Suburban" "Suburban" "Town" "Other" ...
## $ Occupation : Factor w/ 8 levels "Clerical","Crafts",..: 5 5 2 4 5 4 7 5 4 5 ...
## $ MaritalStatus : chr "No" "Yes" "Yes" "No" ...
## $ perc_recurrent_charge : num 91.7 100.1 100 91.2 99.2 ...
## $ perc_overage_minute : num 0 0 0 0 NaN 0 0 0 0 0 ...
Lets calculate churn rate
## Getting Churn counts
churn_counts<- dplyr::count(Telecom_Data,Churn , sort = TRUE)
Lets plot the churn rate using pie chart
library("ggplot2")
ggplot(data = churn_counts, aes(x = "", y = n, fill = Churn)) +
geom_bar(stat = "identity") +
coord_polar("y")
Trying out plotly for pie chart for more interactive graphs
library(plotly)
colors <- c('rgb(211,94,96)', 'rgb(128,133,133)', 'rgb(144,103,167)', 'rgb(171,104,87)', 'rgb(114,147,203)')
fig <- plot_ly(type='pie', labels=churn_counts$Churn, values=churn_counts$n,
textinfo='label+percent',
insidetextorientation='radial',marker = list(colors = colors,
line = list(color = '#FFFFFF', width = 1)))
fig
Subsetting data for in depth analysis on the basis of churn and checking the summary of the divided data to analyze the trend
library(dplyr)
Telecom_Data_yes = filter(Telecom_Data, Churn == "Yes")
Telecom_Data_no = filter(Telecom_Data, Churn == "No")
summary(Telecom_Data_yes)
## CustomerID Churn MonthlyRevenue MonthlyMinutes
## Min. :3000002 No : 0 Min. : 0 Min. : 0
## 1st Qu.:3099298 Yes:14711 1st Qu.: 33 1st Qu.: 132
## Median :3195614 Median : 48 Median : 330
## Mean :3194322 Mean : 58 Mean : 484
## 3rd Qu.:3286308 3rd Qu.: 70 3rd Qu.: 667
## Max. :3399978 Max. :861 Max. :5410
## NA's :70 NA's :70
## TotalRecurringCharge DirectorAssistedCalls OverageMinutes RoamingCalls
## Min. :-11 Min. : 0.0 Min. : 0 Min. : 0
## 1st Qu.: 30 1st Qu.: 0.0 1st Qu.: 0 1st Qu.: 0
## Median : 44 Median : 0.2 Median : 4 Median : 0
## Mean : 45 Mean : 0.8 Mean : 43 Mean : 1
## 3rd Qu.: 55 3rd Qu.: 0.7 3rd Qu.: 46 3rd Qu.: 0
## Max. :338 Max. :45.8 Max. :2018 Max. :851
## NA's :70 NA's :70 NA's :70 NA's :70
## PercChangeMinutes PercChangeRevenues DroppedCalls BlockedCalls
## Min. :-2868 Min. :-851 Min. : 0.0 Min. : 0.0
## 1st Qu.: -101 1st Qu.: -8 1st Qu.: 0.7 1st Qu.: 0.0
## Median : -11 Median : 0 Median : 3.0 Median : 1.0
## Mean : -25 Mean : 0 Mean : 5.8 Mean : 4.0
## 3rd Qu.: 54 3rd Qu.: 2 3rd Qu.: 7.3 3rd Qu.: 3.3
## Max. : 5192 Max. :2484 Max. :208.7 Max. :314.7
## NA's :208 NA's :208
## UnansweredCalls CustomerCareCalls ThreewayCalls ReceivedCalls OutboundCalls
## Min. : 0 Min. : 0.0 Min. : 0.00 Min. : 0 Min. : 0
## 1st Qu.: 4 1st Qu.: 0.0 1st Qu.: 0.00 1st Qu.: 6 1st Qu.: 2
## Median : 15 Median : 0.0 Median : 0.00 Median : 45 Median : 12
## Mean : 26 Mean : 1.6 Mean : 0.26 Mean : 105 Mean : 24
## 3rd Qu.: 34 3rd Qu.: 1.3 3rd Qu.: 0.30 3rd Qu.: 140 3rd Qu.: 32
## Max. :849 Max. :172.3 Max. :30.00 Max. :2619 Max. :520
##
## InboundCalls PeakCallsInOut OffPeakCallsInOut DroppedBlockedCalls
## Min. : 0.0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 0.0 1st Qu.: 19 1st Qu.: 9 1st Qu.: 2
## Median : 1.7 Median : 58 Median : 31 Median : 5
## Mean : 7.3 Mean : 84 Mean : 62 Mean : 10
## 3rd Qu.: 8.0 3rd Qu.: 114 3rd Qu.: 80 3rd Qu.: 12
## Max. :298.3 Max. :1359 Max. :1314 Max. :329
##
## CallForwardingCalls CallWaitingCalls MonthsInService UniqueSubs
## Min. : 0.0 Min. : 0.0 Min. : 6 Min. : 1.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.:12 1st Qu.: 1.0
## Median : 0.0 Median : 0.0 Median :17 Median : 1.0
## Mean : 0.0 Mean : 1.6 Mean :19 Mean : 1.6
## 3rd Qu.: 0.0 3rd Qu.: 1.3 3rd Qu.:24 3rd Qu.: 2.0
## Max. :33.7 Max. :135.7 Max. :61 Max. :196.0
##
## ActiveSubs ServiceArea Handsets HandsetModels
## Min. : 0.0 Length:14711 Min. : 1.00 Min. : 1.0
## 1st Qu.: 1.0 Class :character 1st Qu.: 1.00 1st Qu.: 1.0
## Median : 1.0 Mode :character Median : 1.00 Median : 1.0
## Mean : 1.4 Mean : 1.74 Mean : 1.5
## 3rd Qu.: 2.0 3rd Qu.: 2.00 3rd Qu.: 2.0
## Max. :53.0 Max. :22.00 Max. :14.0
##
## CurrentEquipmentDays AgeHH1 AgeHH2 ChildrenInHH
## Min. : -4 Min. : 0.0 Min. : 0.0 Length:14711
## 1st Qu.: 249 1st Qu.: 0.0 1st Qu.: 0.0 Class :character
## Median : 366 Median :34.0 Median : 0.0 Mode :character
## Mean : 422 Mean :30.3 Mean :20.4
## 3rd Qu.: 564 3rd Qu.:48.0 3rd Qu.:42.0
## Max. :1779 Max. :98.0 Max. :99.0
## NA's :249 NA's :249
## HandsetRefurbished HandsetWebCapable TruckOwner RVOwner
## Length:14711 Length:14711 Length:14711 Length:14711
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Homeownership BuysViaMailOrder RespondsToMailOffers OptOutMailings
## Length:14711 Length:14711 Length:14711 Length:14711
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## NonUSTravel OwnsComputer HasCreditCard RetentionCalls
## Length:14711 Length:14711 Length:14711 Min. :0.00
## Class :character Class :character Class :character 1st Qu.:0.00
## Mode :character Mode :character Mode :character Median :0.00
## Mean :0.06
## 3rd Qu.:0.00
## Max. :4.00
##
## RetentionOffersAccepted NewCellphoneUser NotNewCellphoneUser
## Min. :0.000 Length:14711 Length:14711
## 1st Qu.:0.000 Class :character Class :character
## Median :0.000 Mode :character Mode :character
## Mean :0.026
## 3rd Qu.:0.000
## Max. :3.000
##
## ReferralsMadeBySubscriber IncomeGroup OwnsMotorcycle
## Min. :0.00 Min. :0.00 Length:14711
## 1st Qu.:0.00 1st Qu.:0.00 Class :character
## Median :0.00 Median :5.00 Mode :character
## Mean :0.05 Mean :4.26
## 3rd Qu.:0.00 3rd Qu.:7.00
## Max. :9.00 Max. :9.00
##
## AdjustmentsToCreditRating HandsetPrice MadeCallToRetentionTeam
## Min. :0.00 Length:14711 Length:14711
## 1st Qu.:0.00 Class :character Class :character
## Median :0.00 Mode :character Mode :character
## Mean :0.04
## 3rd Qu.:0.00
## Max. :9.00
##
## CreditRating PrizmCode Occupation MaritalStatus
## 1-Highest:2628 Length:14711 Other :10932 Length:14711
## 2-High :5712 Class :character Professional: 2467 Class :character
## 3-Good :2608 Mode :character Crafts : 426 Mode :character
## 4-Medium :1399 Clerical : 289
## 5-Low :1436 Self : 243
## 6-VeryLow: 316 Retired : 185
## 7-Lowest : 612 (Other) : 169
## perc_recurrent_charge perc_overage_minute
## Min. :-30 Min. : 0
## 1st Qu.: 69 1st Qu.: 0
## Median : 94 Median : 1
## Mean : 88 Mean : 7
## 3rd Qu.:106 3rd Qu.: 10
## Max. :514 Max. :100
## NA's :72 NA's :512
summary(Telecom_Data_no)
## CustomerID Churn MonthlyRevenue MonthlyMinutes
## Min. :3000014 No :36336 Min. : -6 Min. : 0
## 1st Qu.:3101025 Yes: 0 1st Qu.: 34 1st Qu.: 170
## Median :3204388 Median : 49 Median : 381
## Mean :3205048 Mean : 59 Mean : 543
## 3rd Qu.:3313601 3rd Qu.: 72 3rd Qu.: 743
## Max. :3399994 Max. :1223 Max. :7359
## NA's :86 NA's :86
## TotalRecurringCharge DirectorAssistedCalls OverageMinutes RoamingCalls
## Min. : -9 Min. : 0.0 Min. : 0 Min. : 0
## 1st Qu.: 30 1st Qu.: 0.0 1st Qu.: 0 1st Qu.: 0
## Median : 45 Median : 0.2 Median : 2 Median : 0
## Mean : 48 Mean : 0.9 Mean : 39 Mean : 1
## 3rd Qu.: 60 3rd Qu.: 1.0 3rd Qu.: 39 3rd Qu.: 0
## Max. :400 Max. :159.4 Max. :4321 Max. :1112
## NA's :86 NA's :86 NA's :86 NA's :86
## PercChangeMinutes PercChangeRevenues DroppedCalls BlockedCalls
## Min. :-3875 Min. :-1108 Min. : 0.0 Min. : 0
## 1st Qu.: -78 1st Qu.: -7 1st Qu.: 1.0 1st Qu.: 0
## Median : -3 Median : 0 Median : 3.0 Median : 1
## Mean : -6 Mean : -1 Mean : 6.1 Mean : 4
## 3rd Qu.: 70 3rd Qu.: 2 3rd Qu.: 7.7 3rd Qu.: 4
## Max. : 4480 Max. : 1347 Max. :221.7 Max. :384
## NA's :159 NA's :159
## UnansweredCalls CustomerCareCalls ThreewayCalls ReceivedCalls OutboundCalls
## Min. : 0 Min. : 0 Min. : 0.0 Min. : 0 Min. : 0
## 1st Qu.: 6 1st Qu.: 0 1st Qu.: 0.0 1st Qu.: 10 1st Qu.: 4
## Median : 17 Median : 0 Median : 0.0 Median : 56 Median : 14
## Mean : 29 Mean : 2 Mean : 0.3 Mean : 119 Mean : 26
## 3rd Qu.: 37 3rd Qu.: 2 3rd Qu.: 0.3 3rd Qu.: 159 3rd Qu.: 35
## Max. :840 Max. :327 Max. :66.0 Max. :2692 Max. :644
##
## InboundCalls PeakCallsInOut OffPeakCallsInOut DroppedBlockedCalls
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 0 1st Qu.: 25 1st Qu.: 12 1st Qu.: 2
## Median : 2 Median : 64 Median : 38 Median : 6
## Mean : 9 Mean : 93 Mean : 70 Mean : 10
## 3rd Qu.: 10 3rd Qu.: 124 3rd Qu.: 92 3rd Qu.: 13
## Max. :519 Max. :2091 Max. :1475 Max. :412
##
## CallForwardingCalls CallWaitingCalls MonthsInService UniqueSubs
## Min. : 0.0 Min. : 0.0 Min. : 6.0 Min. : 1.00
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.:11.0 1st Qu.: 1.00
## Median : 0.0 Median : 0.3 Median :16.0 Median : 1.00
## Mean : 0.0 Mean : 1.9 Mean :18.6 Mean : 1.51
## 3rd Qu.: 0.0 3rd Qu.: 1.7 3rd Qu.:24.0 3rd Qu.: 2.00
## Max. :81.3 Max. :212.7 Max. :60.0 Max. :12.00
##
## ActiveSubs ServiceArea Handsets HandsetModels
## Min. : 0.00 Length:36336 Min. : 1.00 Min. : 1.00
## 1st Qu.: 1.00 Class :character 1st Qu.: 1.00 1st Qu.: 1.00
## Median : 1.00 Mode :character Median : 1.00 Median : 1.00
## Mean : 1.35 Mean : 1.83 Mean : 1.58
## 3rd Qu.: 2.00 3rd Qu.: 2.00 3rd Qu.: 2.00
## Max. :11.00 Max. :24.00 Max. :15.00
## NA's :1 NA's :1
## CurrentEquipmentDays AgeHH1 AgeHH2 ChildrenInHH
## Min. : -5 Min. : 0 Min. : 0 Length:36336
## 1st Qu.: 197 1st Qu.: 0 1st Qu.: 0 Class :character
## Median : 310 Median :36 Median : 0 Mode :character
## Mean : 364 Mean :32 Mean :21
## 3rd Qu.: 493 3rd Qu.:48 3rd Qu.:44
## Max. :1812 Max. :99 Max. :98
## NA's :1 NA's :660 NA's :660
## HandsetRefurbished HandsetWebCapable TruckOwner RVOwner
## Length:36336 Length:36336 Length:36336 Length:36336
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Homeownership BuysViaMailOrder RespondsToMailOffers OptOutMailings
## Length:36336 Length:36336 Length:36336 Length:36336
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## NonUSTravel OwnsComputer HasCreditCard RetentionCalls
## Length:36336 Length:36336 Length:36336 Min. :0.000
## Class :character Class :character Class :character 1st Qu.:0.000
## Mode :character Mode :character Mode :character Median :0.000
## Mean :0.029
## 3rd Qu.:0.000
## Max. :3.000
##
## RetentionOffersAccepted NewCellphoneUser NotNewCellphoneUser
## Min. :0.000 Length:36336 Length:36336
## 1st Qu.:0.000 Class :character Class :character
## Median :0.000 Mode :character Mode :character
## Mean :0.015
## 3rd Qu.:0.000
## Max. :3.000
##
## ReferralsMadeBySubscriber IncomeGroup OwnsMotorcycle
## Min. : 0.0 Min. :0.00 Length:36336
## 1st Qu.: 0.0 1st Qu.:1.00 Class :character
## Median : 0.0 Median :5.00 Mode :character
## Mean : 0.1 Mean :4.35
## 3rd Qu.: 0.0 3rd Qu.:7.00
## Max. :35.0 Max. :9.00
##
## AdjustmentsToCreditRating HandsetPrice MadeCallToRetentionTeam
## Min. : 0.00 Length:36336 Length:36336
## 1st Qu.: 0.00 Class :character Class :character
## Median : 0.00 Mode :character Mode :character
## Mean : 0.06
## 3rd Qu.: 0.00
## Max. :25.00
##
## CreditRating PrizmCode Occupation MaritalStatus
## 1-Highest: 5894 Length:36336 Other :26705 Length:36336
## 2-High :13281 Class :character Professional: 6288 Class :character
## 3-Good : 5802 Mode :character Crafts : 1093 Mode :character
## 4-Medium : 3958 Clerical : 697
## 5-Low : 5063 Self : 636
## 6-VeryLow: 836 Retired : 548
## 7-Lowest : 1502 (Other) : 369
## perc_recurrent_charge perc_overage_minute
## Min. :-28.1 Min. : 0
## 1st Qu.: 74.0 1st Qu.: 0
## Median : 96.1 Median : 1
## Mean : Inf Mean : 6
## 3rd Qu.:110.4 3rd Qu.: 8
## Max. : Inf Max. :100
## NA's :89 NA's :367
Data Classification
feat_typ_counts <- data.frame(read.csv("Feat_type_counts.csv"))
#install.packages("plotrix")
library(plotrix)
library("ggplot2")
#pie(feat_typ_counts$Counts, feat_typ_counts$Variable.Type)
piepercent<- round(100 * feat_typ_counts$Counts / sum(feat_typ_counts$Counts), 1)
feat_typ_counts$fraction <- feat_typ_counts$Counts / sum(feat_typ_counts$Counts)
# Compute the cumulative percentages (top of each rectangle)
feat_typ_counts$ymax <- cumsum(feat_typ_counts$fraction)
# Compute the bottom of each rectangle
feat_typ_counts$ymin <- c(0, head(feat_typ_counts$ymax, n=-1))
# Compute label position
feat_typ_counts$labelPosition <- (feat_typ_counts$ymax + feat_typ_counts$ymin) / 2
# Compute a good label
feat_typ_counts$label <- paste0(feat_typ_counts$Variable.Type, "\n Count: ", feat_typ_counts$Counts)
ggplot(feat_typ_counts, aes(ymax=ymax, ymin=ymin, xmax=4, xmin=3, fill=Variable.Type)) +
geom_rect() +
geom_label( x=3.5, aes(y=labelPosition, label=label), size=2) +
scale_fill_brewer(palette=4) +
coord_polar(theta="y") +
xlim(c(2, 4)) +
theme_void() +
theme(legend.position = "none")
Getting summary of the data
xkablesummary(Telecom_Data)
| CustomerID | Churn | MonthlyRevenue | MonthlyMinutes | TotalRecurringCharge | DirectorAssistedCalls | OverageMinutes | RoamingCalls | PercChangeMinutes | PercChangeRevenues | DroppedCalls | BlockedCalls | UnansweredCalls | CustomerCareCalls | ThreewayCalls | ReceivedCalls | OutboundCalls | InboundCalls | PeakCallsInOut | OffPeakCallsInOut | DroppedBlockedCalls | CallForwardingCalls | CallWaitingCalls | MonthsInService | UniqueSubs | ActiveSubs | ServiceArea | Handsets | HandsetModels | CurrentEquipmentDays | AgeHH1 | AgeHH2 | ChildrenInHH | HandsetRefurbished | HandsetWebCapable | TruckOwner | RVOwner | Homeownership | BuysViaMailOrder | RespondsToMailOffers | OptOutMailings | NonUSTravel | OwnsComputer | HasCreditCard | RetentionCalls | RetentionOffersAccepted | NewCellphoneUser | NotNewCellphoneUser | ReferralsMadeBySubscriber | IncomeGroup | OwnsMotorcycle | AdjustmentsToCreditRating | HandsetPrice | MadeCallToRetentionTeam | CreditRating | PrizmCode | Occupation | MaritalStatus | perc_recurrent_charge | perc_overage_minute | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min | Min. :3000002 | No :36336 | Min. : -6 | Min. : 0 | Min. :-11 | Min. : 0.0 | Min. : 0 | Min. : 0 | Min. :-3875 | Min. :-1108 | Min. : 0.0 | Min. : 0 | Min. : 0 | Min. : 0 | Min. : 0.0 | Min. : 0 | Min. : 0 | Min. : 0 | Min. : 0 | Min. : 0 | Min. : 0 | Min. : 0.0 | Min. : 0.0 | Min. : 6.0 | Min. : 1.0 | Min. : 0.0 | Length:51047 | Min. : 1.00 | Min. : 1.00 | Min. : -5 | Min. : 0 | Min. : 0 | Length:51047 | Length:51047 | Length:51047 | Length:51047 | Length:51047 | Length:51047 | Length:51047 | Length:51047 | Length:51047 | Length:51047 | Length:51047 | Length:51047 | Min. :0.00 | Min. :0.000 | Length:51047 | Length:51047 | Min. : 0.0 | Min. :0.00 | Length:51047 | Min. : 0.00 | Length:51047 | Length:51047 | 1-Highest: 8522 | Length:51047 | Other :37637 | Length:51047 | Min. :-29.5 | Min. : 0 |
| Q1 | 1st Qu.:3100632 | Yes:14711 | 1st Qu.: 34 | 1st Qu.: 158 | 1st Qu.: 30 | 1st Qu.: 0.0 | 1st Qu.: 0 | 1st Qu.: 0 | 1st Qu.: -83 | 1st Qu.: -7 | 1st Qu.: 0.7 | 1st Qu.: 0 | 1st Qu.: 5 | 1st Qu.: 0 | 1st Qu.: 0.0 | 1st Qu.: 8 | 1st Qu.: 3 | 1st Qu.: 0 | 1st Qu.: 23 | 1st Qu.: 11 | 1st Qu.: 2 | 1st Qu.: 0.0 | 1st Qu.: 0.0 | 1st Qu.:11.0 | 1st Qu.: 1.0 | 1st Qu.: 1.0 | Class :character | 1st Qu.: 1.00 | 1st Qu.: 1.00 | 1st Qu.: 205 | 1st Qu.: 0 | 1st Qu.: 0 | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | 1st Qu.:0.00 | 1st Qu.:0.000 | Class :character | Class :character | 1st Qu.: 0.0 | 1st Qu.:0.00 | Class :character | 1st Qu.: 0.00 | Class :character | Class :character | 2-High :18993 | Class :character | Professional: 8755 | Class :character | 1st Qu.: 72.6 | 1st Qu.: 0 |
| Median | Median :3201534 | NA | Median : 48 | Median : 366 | Median : 45 | Median : 0.2 | Median : 3 | Median : 0 | Median : -5 | Median : 0 | Median : 3.0 | Median : 1 | Median : 16 | Median : 0 | Median : 0.0 | Median : 53 | Median : 14 | Median : 2 | Median : 62 | Median : 36 | Median : 5 | Median : 0.0 | Median : 0.3 | Median :16.0 | Median : 1.0 | Median : 1.0 | Mode :character | Median : 1.00 | Median : 1.00 | Median : 329 | Median :36 | Median : 0 | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Median :0.00 | Median :0.000 | Mode :character | Mode :character | Median : 0.0 | Median :5.00 | Mode :character | Median : 0.00 | Mode :character | Mode :character | 3-Good : 8410 | Mode :character | Crafts : 1519 | Mode :character | Median : 95.4 | Median : 1 |
| Mean | Mean :3201957 | NA | Mean : 59 | Mean : 526 | Mean : 47 | Mean : 0.9 | Mean : 40 | Mean : 1 | Mean : -12 | Mean : -1 | Mean : 6.0 | Mean : 4 | Mean : 28 | Mean : 2 | Mean : 0.3 | Mean : 115 | Mean : 25 | Mean : 8 | Mean : 91 | Mean : 68 | Mean : 10 | Mean : 0.0 | Mean : 1.8 | Mean :18.8 | Mean : 1.5 | Mean : 1.4 | NA | Mean : 1.81 | Mean : 1.56 | Mean : 381 | Mean :31 | Mean :21 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | Mean :0.04 | Mean :0.018 | NA | NA | Mean : 0.1 | Mean :4.32 | NA | Mean : 0.05 | NA | NA | 4-Medium : 5357 | NA | Clerical : 986 | NA | Mean : Inf | Mean : 6 |
| Q3 | 3rd Qu.:3305376 | NA | 3rd Qu.: 71 | 3rd Qu.: 723 | 3rd Qu.: 60 | 3rd Qu.: 1.0 | 3rd Qu.: 41 | 3rd Qu.: 0 | 3rd Qu.: 66 | 3rd Qu.: 2 | 3rd Qu.: 7.7 | 3rd Qu.: 4 | 3rd Qu.: 36 | 3rd Qu.: 2 | 3rd Qu.: 0.3 | 3rd Qu.: 154 | 3rd Qu.: 34 | 3rd Qu.: 9 | 3rd Qu.: 121 | 3rd Qu.: 89 | 3rd Qu.: 12 | 3rd Qu.: 0.0 | 3rd Qu.: 1.3 | 3rd Qu.:24.0 | 3rd Qu.: 2.0 | 3rd Qu.: 2.0 | NA | 3rd Qu.: 2.00 | 3rd Qu.: 2.00 | 3rd Qu.: 515 | 3rd Qu.:48 | 3rd Qu.:42 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 3rd Qu.:0.00 | 3rd Qu.:0.000 | NA | NA | 3rd Qu.: 0.0 | 3rd Qu.:7.00 | NA | 3rd Qu.: 0.00 | NA | NA | 5-Low : 6499 | NA | Self : 879 | NA | 3rd Qu.:109.3 | 3rd Qu.: 8 |
| Max | Max. :3399994 | NA | Max. :1223 | Max. :7359 | Max. :400 | Max. :159.4 | Max. :4321 | Max. :1112 | Max. : 5192 | Max. : 2484 | Max. :221.7 | Max. :384 | Max. :849 | Max. :327 | Max. :66.0 | Max. :2692 | Max. :644 | Max. :519 | Max. :2091 | Max. :1475 | Max. :412 | Max. :81.3 | Max. :212.7 | Max. :61.0 | Max. :196.0 | Max. :53.0 | NA | Max. :24.00 | Max. :15.00 | Max. :1812 | Max. :99 | Max. :99 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | Max. :4.00 | Max. :3.000 | NA | NA | Max. :35.0 | Max. :9.00 | NA | Max. :25.00 | NA | NA | 6-VeryLow: 1152 | NA | Retired : 733 | NA | Max. : Inf | Max. :100 |
| NA | NA | NA | NA’s :156 | NA’s :156 | NA’s :156 | NA’s :156 | NA’s :156 | NA’s :156 | NA’s :367 | NA’s :367 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA’s :1 | NA’s :1 | NA’s :1 | NA’s :909 | NA’s :909 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 7-Lowest : 2114 | NA | (Other) : 538 | NA | NA’s :161 | NA’s :879 |
Box plot of the Monnthly Minutes
boxplot(Telecom_Data$MonthlyMinutes,
main = "Monthly Minutes of Customers",
xlab = "Monthly Min",
ylab = "Frequency",
col = "orange",
border = "brown",
horizontal = TRUE,
notch = TRUE
)
##Current Headset use in days
plot_ly(Telecom_Data, y= Telecom_Data$CurrentEquipmentDays, color = Telecom_Data$Churn, type = "box") %>%
layout(boxmode = "group",
xaxis = list(title=''),
yaxis = list(title='Frequency'))
Boxplot of Total Recurring Charge
plot_ly(Telecom_Data, y= Telecom_Data$TotalRecurringCharge, color = Telecom_Data$Churn, type = "box") %>%
layout(boxmode = "group",
xaxis = list(title=''),
yaxis = list(title='Frequency'))
Box plot of Month in Service
plot_ly(Telecom_Data, y= Telecom_Data$MonthsInService, color = Telecom_Data$Churn, type = "box") %>%
layout(boxmode = "group",
xaxis = list(title=''),
yaxis = list(title='Frequency'))
Box plot of the Percent change in recurrent charge
plot_ly(Telecom_Data, y= Telecom_Data$perc_recurrent_charge, color = Telecom_Data$Churn, type = "box") %>%
layout(boxmode = "group",
xaxis = list(title=''),
yaxis = list(title='Frequency'))
Box plot of Percent change in Minutes
plot_ly(Telecom_Data, y= Telecom_Data$PercChangeMinutes, color = Telecom_Data$Churn, type = "box") %>%
layout(boxmode = "group",
xaxis = list(title=''),
yaxis = list(title='Frequency'))
Box plot of Percent change in Revenues
plot_ly(Telecom_Data, y= Telecom_Data$PercChangeRevenues, color = Telecom_Data$Churn, type = "box") %>%
layout(boxmode = "group",
xaxis = list(title=''),
yaxis = list(title='Frequency'))
Distribution of the Montly Revenue
library(ggplot2)
library(plotly)
set.seed(1)
gg <- ggplot(Telecom_Data,aes(x = MonthlyRevenue, color = 'density')) +
geom_histogram(aes(y = ..density..), bins = 7, fill = '#67B7D1', alpha = 0.5) +
geom_density(color = '#67B7D1') +
geom_rug(color = '#67B7D1') +
ylab("") +
xlab("") + theme(legend.title=element_blank()) +
scale_color_manual(values = c('density' = '#67B7D1'))
ggplotly(gg)%>%
layout(plot_bgcolor='#e5ecf6',
xaxis = list(
title='Time',
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
yaxis = list(
title='Monthly Revenue',
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'))
Distribution of Monthly Minutes
library(ggplot2)
library(plotly)
set.seed(1)
gg <- ggplot(Telecom_Data,aes(x = MonthlyMinutes, color = 'density')) +
geom_histogram(aes(y = ..density..), bins = 7, fill = '#67B7D1', alpha = 0.5) +
geom_density(color = '#67B7D1') +
geom_rug(color = '#67B7D1') +
ylab("") +
xlab("") + theme(legend.title=element_blank()) +
scale_color_manual(values = c('density' = '#67B7D1'))
ggplotly(gg)%>%
layout(plot_bgcolor='#e5ecf6',
xaxis = list(
title='Monthly Minutes ',
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
yaxis = list(
title='Frequency',
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'))
qqnorm(Telecom_Data$MonthlyMinutes) # QQplot
qqline(Telecom_Data$MonthlyMinutes, col = "red")
#install.packages("car")
#library("car")
#qqPlot(Telecom_Data$MonthlyMinutes)
library("plotly")
#plot_ly(Telecom_Data, y= Telecom_Data$AgeHH1, color = Telecom_Data$Churn, type = "box")
#layout(boxmode = "group",
# xaxis = list(title=''),
# yaxis = list(title='Frequency'))
How travel Affects churn rate
churn_count<-nrow(Telecom_Data$Churn)
ggplot(Telecom_Data,aes(x = NonUSTravel,fill=Churn )) +
geom_bar( position = "stack")+ggtitle("How Travel affects churn")
Do number of dropped call have affect on churn
ggplot(Telecom_Data, aes(x=DroppedCalls, fill=Churn)) + geom_histogram(position='identity',alpha=0.6)
Income group of customers
ggplot(Telecom_Data,aes(x=IncomeGroup, fill=Churn))+geom_histogram(position='identity',alpha=0.6)
How many customer opt out of mailing list
ggplot(Telecom_Data,aes(x=OptOutMailings,fill=Churn))+geom_bar(position='identity',alpha=0.6)
How does credit rating have an impact on Churn?
Boxplot for Credit Rating using ggplot
library(ggplot2)
ggplot(Telecom_Data, aes(y=CreditRating)) + geom_boxplot( colour="orange", fill="black") + ggtitle("Credit Rating using `ggplot`")
Subsetting Churned and Retained data
Churned <- subset(Telecom_Data, Churn=="Yes")
Retained <- subset(Telecom_Data, Churn=="No")
str(Churned)
## 'data.frame': 14711 obs. of 60 variables:
## $ CustomerID : int 3000002 3000010 3000026 3000082 3000122 3000158 3000174 3000182 3000190 3000194 ...
## $ Churn : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ MonthlyRevenue : num 24 17 17.1 172.4 24.5 ...
## $ MonthlyMinutes : int 219 10 0 1978 42 196 4 684 852 782 ...
## $ TotalRecurringCharge : int 22 17 17 100 17 30 17 55 85 25 ...
## $ DirectorAssistedCalls : num 0.25 0 0 0 0 0 0 0 0 0 ...
## $ OverageMinutes : int 0 0 0 362 10 0 0 3 0 233 ...
## $ RoamingCalls : num 0 0 0 0 0 2.6 0 0 0 0 ...
## $ PercChangeMinutes : int -157 -4 0 -1007 81 54 -4 -2 -206 -258 ...
## $ PercChangeRevenues : num -19 0 -0.2 -72.5 29.9 8.6 0 0.6 0 -80 ...
## $ DroppedCalls : num 0.7 0.3 0 7.3 0 6.7 0 15.7 10.7 1 ...
## $ BlockedCalls : num 0.7 0 0 18 0 0.3 1 1.3 6.3 0.7 ...
## $ UnansweredCalls : num 6.3 2.7 0 114.3 0 ...
## $ CustomerCareCalls : num 0 0 0 0.3 0 0 0 5 1 0.3 ...
## $ ThreewayCalls : num 0 0 0 0.7 0 0.7 0 0 0 0 ...
## $ ReceivedCalls : num 97.2 0 0 515.2 0 ...
## $ OutboundCalls : num 0 0 0 22.7 0 9.7 0 19.3 54.7 1.3 ...
## $ InboundCalls : num 0 0 0 2.7 0 5.7 0 0.3 29.7 0 ...
## $ PeakCallsInOut : num 58 5 0 718 3 ...
## $ OffPeakCallsInOut : num 24 1 0 60.3 0.7 ...
## $ DroppedBlockedCalls : num 1.3 0.3 0 25.3 0 7 1 17 17 1.7 ...
## $ CallForwardingCalls : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CallWaitingCalls : num 0.3 0 0 20.3 0 0 0 0.7 6 2.7 ...
## $ MonthsInService : int 61 58 53 58 58 54 55 55 53 55 ...
## $ UniqueSubs : int 2 1 2 2 2 2 1 2 5 1 ...
## $ ActiveSubs : int 1 1 2 1 1 1 1 1 4 1 ...
## $ ServiceArea : chr "SEAPOR503" "PITHOM412" "OKCTUL918" "LOULOU502" ...
## $ Handsets : int 2 2 4 4 3 4 1 2 8 5 ...
## $ HandsetModels : int 2 1 3 3 3 3 1 2 3 4 ...
## $ CurrentEquipmentDays : int 361 1504 852 143 776 179 1661 864 151 162 ...
## $ AgeHH1 : int 62 40 46 48 36 50 64 50 50 36 ...
## $ AgeHH2 : int 0 42 54 32 0 48 64 0 50 38 ...
## $ ChildrenInHH : chr "No" "Yes" "No" "No" ...
## $ HandsetRefurbished : chr "No" "No" "No" "No" ...
## $ HandsetWebCapable : chr "Yes" "No" "No" "Yes" ...
## $ TruckOwner : chr "No" "No" "No" "Yes" ...
## $ RVOwner : chr "No" "No" "No" "Yes" ...
## $ Homeownership : chr "Known" "Known" "Known" "Known" ...
## $ BuysViaMailOrder : chr "Yes" "Yes" "Yes" "Yes" ...
## $ RespondsToMailOffers : chr "Yes" "Yes" "Yes" "Yes" ...
## $ OptOutMailings : chr "No" "No" "No" "No" ...
## $ NonUSTravel : chr "No" "No" "No" "No" ...
## $ OwnsComputer : chr "Yes" "Yes" "Yes" "No" ...
## $ HasCreditCard : chr "Yes" "Yes" "Yes" "Yes" ...
## $ RetentionCalls : int 1 0 0 1 0 0 0 0 0 0 ...
## $ RetentionOffersAccepted : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NewCellphoneUser : chr "No" "Yes" "No" "No" ...
## $ NotNewCellphoneUser : chr "No" "No" "Yes" "No" ...
## $ ReferralsMadeBySubscriber: int 0 0 0 0 0 0 0 0 0 0 ...
## $ IncomeGroup : int 4 5 9 6 9 9 6 9 5 7 ...
## $ OwnsMotorcycle : chr "No" "No" "No" "No" ...
## $ AdjustmentsToCreditRating: int 0 0 1 0 0 1 0 1 0 0 ...
## $ HandsetPrice : chr "30" "30" "10" "150" ...
## $ MadeCallToRetentionTeam : chr "Yes" "No" "No" "Yes" ...
## $ CreditRating : Factor w/ 7 levels "1-Highest","2-High",..: 1 4 1 1 1 1 1 3 1 3 ...
## $ PrizmCode : chr "Suburban" "Suburban" "Other" "Other" ...
## $ Occupation : Factor w/ 8 levels "Clerical","Crafts",..: 5 5 5 5 4 5 5 5 2 4 ...
## $ MaritalStatus : chr "No" "Yes" "Yes" "Unknown" ...
## $ perc_recurrent_charge : num 91.7 100.1 99.2 58 69.4 ...
## $ perc_overage_minute : num 0 0 NaN 18.3 23.8 ...
str(Retained)
## 'data.frame': 36336 obs. of 60 variables:
## $ CustomerID : int 3000014 3000022 3000030 3000038 3000042 3000046 3000050 3000054 3000058 3000062 ...
## $ Churn : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ MonthlyRevenue : num 38 82.3 38 31.7 62.1 ...
## $ MonthlyMinutes : int 8 1312 682 26 98 24 1056 2 1972 270 ...
## $ TotalRecurringCharge : int 38 75 52 30 66 35 75 25 85 37 ...
## $ DirectorAssistedCalls : num 0 1.24 0.25 0.25 2.48 0 0 0 2.23 0.25 ...
## $ OverageMinutes : int 0 0 0 0 0 0 0 0 250 6 ...
## $ RoamingCalls : num 0 0 0 0 0 0 0 0 35.5 0 ...
## $ PercChangeMinutes : int -2 157 148 60 24 20 43 -2 -200 202 ...
## $ PercChangeRevenues : num 0 8.1 -3.1 4 6.8 ...
## $ DroppedCalls : num 0 52 9 0 0 0 0 0 9 3.3 ...
## $ BlockedCalls : num 0 7.7 1.7 1 0.3 0 0 0 0 1.7 ...
## $ UnansweredCalls : num 0 76 13 2.3 4 1 0 0.3 43.7 7.7 ...
## $ CustomerCareCalls : num 0 4.3 0.7 0 4 0 0 0 0.3 1 ...
## $ ThreewayCalls : num 0 1.3 0 0 0 0 0 0 0 0 ...
## $ ReceivedCalls : num 0.4 200.3 42.2 0 0 ...
## $ OutboundCalls : num 0.3 370.3 6.7 0 3.7 ...
## $ InboundCalls : num 0 147 0 0 0 1.7 0 0 4.7 3.7 ...
## $ PeakCallsInOut : num 1.3 555.7 33.3 1.7 7.7 ...
## $ OffPeakCallsInOut : num 3.7 303.7 53 1.7 7.3 ...
## $ DroppedBlockedCalls : num 0 59.7 10.7 1 0.3 0 0 0 9 5 ...
## $ CallForwardingCalls : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CallWaitingCalls : num 0 22.7 0.7 0 0 0 0 0 1 0.3 ...
## $ MonthsInService : int 60 59 53 57 59 53 55 53 59 55 ...
## $ UniqueSubs : int 1 2 1 2 2 3 1 2 5 2 ...
## $ ActiveSubs : int 1 2 1 2 2 3 1 2 1 2 ...
## $ ServiceArea : chr "MILMIL414" "PITHOM412" "OKCTUL918" "OKCTUL918" ...
## $ Handsets : int 1 9 3 2 3 4 9 2 10 5 ...
## $ HandsetModels : int 1 4 2 2 3 3 5 2 6 4 ...
## $ CurrentEquipmentDays : int 1812 458 231 601 464 544 388 354 199 697 ...
## $ AgeHH1 : int 26 30 28 52 46 36 46 0 30 58 ...
## $ AgeHH2 : int 26 0 0 58 46 34 68 0 22 58 ...
## $ ChildrenInHH : chr "Yes" "No" "No" "No" ...
## $ HandsetRefurbished : chr "No" "No" "No" "No" ...
## $ HandsetWebCapable : chr "No" "Yes" "Yes" "Yes" ...
## $ TruckOwner : chr "No" "No" "No" "No" ...
## $ RVOwner : chr "No" "No" "No" "No" ...
## $ Homeownership : chr "Unknown" "Known" "Known" "Known" ...
## $ BuysViaMailOrder : chr "No" "Yes" "No" "Yes" ...
## $ RespondsToMailOffers : chr "No" "Yes" "No" "Yes" ...
## $ OptOutMailings : chr "No" "No" "No" "No" ...
## $ NonUSTravel : chr "No" "No" "Yes" "Yes" ...
## $ OwnsComputer : chr "No" "No" "No" "No" ...
## $ HasCreditCard : chr "Yes" "Yes" "Yes" "Yes" ...
## $ RetentionCalls : int 0 0 0 0 0 0 0 0 0 0 ...
## $ RetentionOffersAccepted : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NewCellphoneUser : chr "Yes" "Yes" "Yes" "No" ...
## $ NotNewCellphoneUser : chr "No" "No" "No" "Yes" ...
## $ ReferralsMadeBySubscriber: int 0 0 0 0 0 0 0 0 0 0 ...
## $ IncomeGroup : int 6 6 1 9 6 9 5 7 3 1 ...
## $ OwnsMotorcycle : chr "No" "No" "No" "No" ...
## $ AdjustmentsToCreditRating: int 0 0 1 1 0 0 1 0 1 1 ...
## $ HandsetPrice : chr "Unknown" "10" "30" "30" ...
## $ MadeCallToRetentionTeam : chr "No" "No" "No" "No" ...
## $ CreditRating : Factor w/ 7 levels "1-Highest","2-High",..: 3 4 3 1 1 1 3 1 4 3 ...
## $ PrizmCode : chr "Town" "Other" "Other" "Other" ...
## $ Occupation : Factor w/ 8 levels "Clerical","Crafts",..: 2 4 4 7 5 4 5 4 4 4 ...
## $ MaritalStatus : chr "Yes" "No" "Yes" "Yes" ...
## $ perc_recurrent_charge : num 100 91.2 136.7 94.8 106.2 ...
## $ perc_overage_minute : num 0 0 0 0 0 ...
Barplot for Credit Rating of Churned Data vs Count using ggplot
library(ggplot2)
ggplot(Churned, aes(x = CreditRating)) + geom_bar(col="black", fill="red", alpha=0.4) + ggtitle("Credit Rating for Churned Telecom Data") + labs(x="Credit Rating (x-axis)", y=" Count (y-axis)") + ylim(0,15000) + theme_classic()
Barplot for Credit Rating of Retained Data vs Count using ggplot
library(ggplot2)
ggplot(Retained, aes(x = CreditRating)) + geom_bar(col="black", fill="aquamarine3", alpha=0.6) + ggtitle("Credit Rating for Retained Telecom Data") + labs(x="Credit Rating (x-axis)", y=" Count (y-axis)")+ ylim(0,15000) + theme_classic()
Histogram for representing Age of Customers in Telecom Data
library(plotly)
ggplot(Telecom_Data, aes(x=AgeHH1))+ geom_histogram(color="aquamarine4",fill = "aquamarine3",alpha=0.6, bins=30) + labs(x="Age of Customers", y="Frequency",
title="Histogram of Customer Age") + theme_classic()
Filtering Age of Primary Users
library(dplyr)
AgeFiltered = filter(Telecom_Data, AgeHH1== 0)
nrow(AgeFiltered)
## [1] 13917
(13917/nrow(Telecom_Data))*100
## [1] 27.3
Boxplot Representing Customer Age Group in Telecom Data using ggplot
library(ggplot2)
ggplot(Telecom_Data, aes(y=AgeHH1)) + geom_boxplot( colour="maroon", fill="aquamarine3",alpha=0.6) + ggtitle("Boxplot of Customer Age group`") + labs(x="Age of Customers", y=" Frequency") + theme_classic()
Boxplot Representing Customer Age Group in Telecom Data using plotly
library(plotly)
plot_ly(Telecom_Data, y= Telecom_Data$AgeHH1,type = "box", color = Telecom_Data$Churn) %>%
layout(boxmode = "group",
xaxis = list(title=''),
yaxis = list(title='Frequency'))
Creating a subset for Churned and retained customers
#Created a subset for Churned and Retained data
Churned <- subset(Telecom_Data, Churn=="Yes")
Retained <- subset(Telecom_Data, Churn=="No")
Analyzing the churn rate with respect to months in service
# Histogram for relationship between months in service and Churn
ggplot(Churned, aes(x=MonthsInService, fill=Churn)) + geom_histogram(position='identity',alpha=0.6,color='aquamarine4',fill='aquamarine3')+xlab("Service period for churned customers (In Months) ")+ylab("Frequency") + theme_classic()+ggtitle("Service Months Distribution for Churned customers")
Mean_MonthsInService=mean(Churned$MonthsInService)
print(paste("Mean of service months of the customer:",Mean_MonthsInService))
## [1] "Mean of service months of the customer: 19.0443205764394"
Median_MonthsInService=median(Churned$MonthsInService)
print(paste("Median of service months of the customer:",Median_MonthsInService))
## [1] "Median of service months of the customer: 17"
Analyzing Prizm Codes effect on Churn rate
# Barplot for Prizm Code effect on Churn
ggplot(Telecom_Data, aes(x=PrizmCode, fill = Churn)) +geom_bar(position = "dodge2")+ggtitle("Churn distribution for Prizm code")
#install.packages("plotly")
library(plotly)
colors <- c('rgb(128,133,133)', 'rgb(144,103,167)', 'rgb(171,104,87)', 'rgb(114,147,203)')
fig <- plot_ly(type='pie', labels=Churned$PrizmCode, values=Churned$n,
textinfo='label+percent',
insidetextorientation='radial',marker = list(colors = "Set1"),
line = list(color = '#FFFFFF', width = 1))
fig
fig_1 <- plot_ly(type='pie', labels=Retained$PrizmCode, values=Retained$n,
textinfo='label+percent',
insidetextorientation='radial',marker = list(colors = "Set1"),
line = list(color = '#FFFFFF', width = 1))
fig_1
Checking if occupation has any effect on Churn
#Frequency distribution of Occupation
ggplot(Telecom_Data,aes(x=Occupation)) + geom_bar(fill = "bisque") + ggtitle("Frequency distribution of occupation")
#Creating a contingency table for Occupation and Churn
Occupation_Churn<-table(Telecom_Data$Occupation,Telecom_Data$Churn)
str(Occupation_Churn)
## 'table' int [1:8, 1:2] 697 1093 106 26705 6288 548 636 263 289 426 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:8] "Clerical" "Crafts" "Homemaker" "Other" ...
## ..$ : chr [1:2] "No" "Yes"
#Performing Chi Square Test to check if occupation is independent of churn
chisq_test=chisq.test(Occupation_Churn)
chisq_test
##
## Pearson's Chi-squared test
##
## data: Occupation_Churn
## X-squared = 10, df = 7, p-value = 0.2
p_value=chisq_test$p.value
print(paste("The p value is:",p_value))
## [1] "The p value is: 0.171354265327925"
p value is greater than 0.05. Hence we will be accepting the null hypothesis, H0. Therefore we can say that occupation is independent of churn.
Telecom_Data <- read.csv("Telecom Data.csv")
typeof(Telecom_Data)
## [1] "list"
head(Telecom_Data)
## CustomerID Churn MonthlyRevenue MonthlyMinutes TotalRecurringCharge
## 1 3000002 Yes 24.0 219 22
## 2 3000010 Yes 17.0 10 17
## 3 3000014 No 38.0 8 38
## 4 3000022 No 82.3 1312 75
## 5 3000026 Yes 17.1 0 17
## 6 3000030 No 38.0 682 52
## DirectorAssistedCalls OverageMinutes RoamingCalls PercChangeMinutes
## 1 0.25 0 0 -157
## 2 0.00 0 0 -4
## 3 0.00 0 0 -2
## 4 1.24 0 0 157
## 5 0.00 0 0 0
## 6 0.25 0 0 148
## PercChangeRevenues DroppedCalls BlockedCalls UnansweredCalls
## 1 -19.0 0.7 0.7 6.3
## 2 0.0 0.3 0.0 2.7
## 3 0.0 0.0 0.0 0.0
## 4 8.1 52.0 7.7 76.0
## 5 -0.2 0.0 0.0 0.0
## 6 -3.1 9.0 1.7 13.0
## CustomerCareCalls ThreewayCalls ReceivedCalls OutboundCalls InboundCalls
## 1 0.0 0.0 97.2 0.0 0
## 2 0.0 0.0 0.0 0.0 0
## 3 0.0 0.0 0.4 0.3 0
## 4 4.3 1.3 200.3 370.3 147
## 5 0.0 0.0 0.0 0.0 0
## 6 0.7 0.0 42.2 6.7 0
## PeakCallsInOut OffPeakCallsInOut DroppedBlockedCalls CallForwardingCalls
## 1 58.0 24.0 1.3 0
## 2 5.0 1.0 0.3 0
## 3 1.3 3.7 0.0 0
## 4 555.7 303.7 59.7 0
## 5 0.0 0.0 0.0 0
## 6 33.3 53.0 10.7 0
## CallWaitingCalls MonthsInService UniqueSubs ActiveSubs ServiceArea Handsets
## 1 0.3 61 2 1 SEAPOR503 2
## 2 0.0 58 1 1 PITHOM412 2
## 3 0.0 60 1 1 MILMIL414 1
## 4 22.7 59 2 2 PITHOM412 9
## 5 0.0 53 2 2 OKCTUL918 4
## 6 0.7 53 1 1 OKCTUL918 3
## HandsetModels CurrentEquipmentDays AgeHH1 AgeHH2 ChildrenInHH
## 1 2 361 62 0 No
## 2 1 1504 40 42 Yes
## 3 1 1812 26 26 Yes
## 4 4 458 30 0 No
## 5 3 852 46 54 No
## 6 2 231 28 0 No
## HandsetRefurbished HandsetWebCapable TruckOwner RVOwner Homeownership
## 1 No Yes No No Known
## 2 No No No No Known
## 3 No No No No Unknown
## 4 No Yes No No Known
## 5 No No No No Known
## 6 No Yes No No Known
## BuysViaMailOrder RespondsToMailOffers OptOutMailings NonUSTravel OwnsComputer
## 1 Yes Yes No No Yes
## 2 Yes Yes No No Yes
## 3 No No No No No
## 4 Yes Yes No No No
## 5 Yes Yes No No Yes
## 6 No No No Yes No
## HasCreditCard RetentionCalls RetentionOffersAccepted NewCellphoneUser
## 1 Yes 1 0 No
## 2 Yes 0 0 Yes
## 3 Yes 0 0 Yes
## 4 Yes 0 0 Yes
## 5 Yes 0 0 No
## 6 Yes 0 0 Yes
## NotNewCellphoneUser ReferralsMadeBySubscriber IncomeGroup OwnsMotorcycle
## 1 No 0 4 No
## 2 No 0 5 No
## 3 No 0 6 No
## 4 No 0 6 No
## 5 Yes 0 9 No
## 6 No 0 1 No
## AdjustmentsToCreditRating HandsetPrice MadeCallToRetentionTeam CreditRating
## 1 0 30 Yes 1-Highest
## 2 0 30 No 4-Medium
## 3 0 Unknown No 3-Good
## 4 0 10 No 4-Medium
## 5 1 10 No 1-Highest
## 6 1 30 No 3-Good
## PrizmCode Occupation MaritalStatus
## 1 Suburban Professional No
## 2 Suburban Professional Yes
## 3 Town Crafts Yes
## 4 Other Other No
## 5 Other Professional Yes
## 6 Other Other Yes
Correlation Analysis of Monthly Revenue and Overage Minutes for churned customers.
churndata <- subset(Telecom_Data, Telecom_Data$Churn == "Yes")
head(churndata)
## CustomerID Churn MonthlyRevenue MonthlyMinutes TotalRecurringCharge
## 1 3000002 Yes 24.0 219 22
## 2 3000010 Yes 17.0 10 17
## 5 3000026 Yes 17.1 0 17
## 16 3000082 Yes 172.4 1978 100
## 19 3000122 Yes 24.5 42 17
## 26 3000158 Yes 33.5 196 30
## DirectorAssistedCalls OverageMinutes RoamingCalls PercChangeMinutes
## 1 0.25 0 0.0 -157
## 2 0.00 0 0.0 -4
## 5 0.00 0 0.0 0
## 16 0.00 362 0.0 -1007
## 19 0.00 10 0.0 81
## 26 0.00 0 2.6 54
## PercChangeRevenues DroppedCalls BlockedCalls UnansweredCalls
## 1 -19.0 0.7 0.7 6.3
## 2 0.0 0.3 0.0 2.7
## 5 -0.2 0.0 0.0 0.0
## 16 -72.5 7.3 18.0 114.3
## 19 29.9 0.0 0.0 0.0
## 26 8.6 6.7 0.3 6.3
## CustomerCareCalls ThreewayCalls ReceivedCalls OutboundCalls InboundCalls
## 1 0.0 0.0 97.2 0.0 0.0
## 2 0.0 0.0 0.0 0.0 0.0
## 5 0.0 0.0 0.0 0.0 0.0
## 16 0.3 0.7 515.2 22.7 2.7
## 19 0.0 0.0 0.0 0.0 0.0
## 26 0.0 0.7 74.1 9.7 5.7
## PeakCallsInOut OffPeakCallsInOut DroppedBlockedCalls CallForwardingCalls
## 1 58 24.0 1.3 0
## 2 5 1.0 0.3 0
## 5 0 0.0 0.0 0
## 16 718 60.3 25.3 0
## 19 3 0.7 0.0 0
## 26 38 19.3 7.0 0
## CallWaitingCalls MonthsInService UniqueSubs ActiveSubs ServiceArea Handsets
## 1 0.3 61 2 1 SEAPOR503 2
## 2 0.0 58 1 1 PITHOM412 2
## 5 0.0 53 2 2 OKCTUL918 4
## 16 20.3 58 2 1 LOULOU502 4
## 19 0.0 58 2 1 KCYKCK913 3
## 26 0.0 54 2 1 DENDEN303 4
## HandsetModels CurrentEquipmentDays AgeHH1 AgeHH2 ChildrenInHH
## 1 2 361 62 0 No
## 2 1 1504 40 42 Yes
## 5 3 852 46 54 No
## 16 3 143 48 32 No
## 19 3 776 36 0 No
## 26 3 179 50 48 Yes
## HandsetRefurbished HandsetWebCapable TruckOwner RVOwner Homeownership
## 1 No Yes No No Known
## 2 No No No No Known
## 5 No No No No Known
## 16 No Yes Yes Yes Known
## 19 No Yes No No Known
## 26 Yes Yes Yes Yes Known
## BuysViaMailOrder RespondsToMailOffers OptOutMailings NonUSTravel
## 1 Yes Yes No No
## 2 Yes Yes No No
## 5 Yes Yes No No
## 16 Yes Yes No No
## 19 No No No No
## 26 Yes Yes No No
## OwnsComputer HasCreditCard RetentionCalls RetentionOffersAccepted
## 1 Yes Yes 1 0
## 2 Yes Yes 0 0
## 5 Yes Yes 0 0
## 16 No Yes 1 0
## 19 No No 0 0
## 26 No Yes 0 0
## NewCellphoneUser NotNewCellphoneUser ReferralsMadeBySubscriber IncomeGroup
## 1 No No 0 4
## 2 Yes No 0 5
## 5 No Yes 0 9
## 16 No No 0 6
## 19 No Yes 0 9
## 26 Yes No 0 9
## OwnsMotorcycle AdjustmentsToCreditRating HandsetPrice
## 1 No 0 30
## 2 No 0 30
## 5 No 1 10
## 16 No 0 150
## 19 No 0 30
## 26 No 1 30
## MadeCallToRetentionTeam CreditRating PrizmCode Occupation MaritalStatus
## 1 Yes 1-Highest Suburban Professional No
## 2 No 4-Medium Suburban Professional Yes
## 5 No 1-Highest Other Professional Yes
## 16 Yes 1-Highest Other Professional Unknown
## 19 No 1-Highest Other Other No
## 26 No 1-Highest Suburban Professional Yes
nrow(churndata)
## [1] 14711
sum(is.na(churndata$MonthlyRevenue))
## [1] 70
sum(is.na(churndata$OverageMinutes))
## [1] 70
We can see there are 70 NA values in monthly revenue and overageminutes columns when churn is yes Lets remove those NA values
churndata2 <- na.omit(churndata)
head(churndata2)
## CustomerID Churn MonthlyRevenue MonthlyMinutes TotalRecurringCharge
## 1 3000002 Yes 24.0 219 22
## 2 3000010 Yes 17.0 10 17
## 5 3000026 Yes 17.1 0 17
## 16 3000082 Yes 172.4 1978 100
## 19 3000122 Yes 24.5 42 17
## 26 3000158 Yes 33.5 196 30
## DirectorAssistedCalls OverageMinutes RoamingCalls PercChangeMinutes
## 1 0.25 0 0.0 -157
## 2 0.00 0 0.0 -4
## 5 0.00 0 0.0 0
## 16 0.00 362 0.0 -1007
## 19 0.00 10 0.0 81
## 26 0.00 0 2.6 54
## PercChangeRevenues DroppedCalls BlockedCalls UnansweredCalls
## 1 -19.0 0.7 0.7 6.3
## 2 0.0 0.3 0.0 2.7
## 5 -0.2 0.0 0.0 0.0
## 16 -72.5 7.3 18.0 114.3
## 19 29.9 0.0 0.0 0.0
## 26 8.6 6.7 0.3 6.3
## CustomerCareCalls ThreewayCalls ReceivedCalls OutboundCalls InboundCalls
## 1 0.0 0.0 97.2 0.0 0.0
## 2 0.0 0.0 0.0 0.0 0.0
## 5 0.0 0.0 0.0 0.0 0.0
## 16 0.3 0.7 515.2 22.7 2.7
## 19 0.0 0.0 0.0 0.0 0.0
## 26 0.0 0.7 74.1 9.7 5.7
## PeakCallsInOut OffPeakCallsInOut DroppedBlockedCalls CallForwardingCalls
## 1 58 24.0 1.3 0
## 2 5 1.0 0.3 0
## 5 0 0.0 0.0 0
## 16 718 60.3 25.3 0
## 19 3 0.7 0.0 0
## 26 38 19.3 7.0 0
## CallWaitingCalls MonthsInService UniqueSubs ActiveSubs ServiceArea Handsets
## 1 0.3 61 2 1 SEAPOR503 2
## 2 0.0 58 1 1 PITHOM412 2
## 5 0.0 53 2 2 OKCTUL918 4
## 16 20.3 58 2 1 LOULOU502 4
## 19 0.0 58 2 1 KCYKCK913 3
## 26 0.0 54 2 1 DENDEN303 4
## HandsetModels CurrentEquipmentDays AgeHH1 AgeHH2 ChildrenInHH
## 1 2 361 62 0 No
## 2 1 1504 40 42 Yes
## 5 3 852 46 54 No
## 16 3 143 48 32 No
## 19 3 776 36 0 No
## 26 3 179 50 48 Yes
## HandsetRefurbished HandsetWebCapable TruckOwner RVOwner Homeownership
## 1 No Yes No No Known
## 2 No No No No Known
## 5 No No No No Known
## 16 No Yes Yes Yes Known
## 19 No Yes No No Known
## 26 Yes Yes Yes Yes Known
## BuysViaMailOrder RespondsToMailOffers OptOutMailings NonUSTravel
## 1 Yes Yes No No
## 2 Yes Yes No No
## 5 Yes Yes No No
## 16 Yes Yes No No
## 19 No No No No
## 26 Yes Yes No No
## OwnsComputer HasCreditCard RetentionCalls RetentionOffersAccepted
## 1 Yes Yes 1 0
## 2 Yes Yes 0 0
## 5 Yes Yes 0 0
## 16 No Yes 1 0
## 19 No No 0 0
## 26 No Yes 0 0
## NewCellphoneUser NotNewCellphoneUser ReferralsMadeBySubscriber IncomeGroup
## 1 No No 0 4
## 2 Yes No 0 5
## 5 No Yes 0 9
## 16 No No 0 6
## 19 No Yes 0 9
## 26 Yes No 0 9
## OwnsMotorcycle AdjustmentsToCreditRating HandsetPrice
## 1 No 0 30
## 2 No 0 30
## 5 No 1 10
## 16 No 0 150
## 19 No 0 30
## 26 No 1 30
## MadeCallToRetentionTeam CreditRating PrizmCode Occupation MaritalStatus
## 1 Yes 1-Highest Suburban Professional No
## 2 No 4-Medium Suburban Professional Yes
## 5 No 1-Highest Other Professional Yes
## 16 Yes 1-Highest Other Professional Unknown
## 19 No 1-Highest Other Other No
## 26 No 1-Highest Suburban Professional Yes
nrow(churndata2)
## [1] 14257
sum(is.na(churndata2$MonthlyRevenue))
## [1] 0
sum(is.na(churndata2$OverageMinutes))
## [1] 0
We have removed all the null values.
cor.test(churndata2$MonthlyRevenue, churndata2$OverageMinutes, method='spearman')
##
## Spearman's rank correlation rho
##
## data: churndata2$MonthlyRevenue and churndata2$OverageMinutes
## S = 2e+11, p-value <2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.591
The output is 0.5911 which is approximately 0.6. As the sign is positive, we can say that monthly revenue and overage minutes vary postively when there is churn. That is, the as the overage minutes increases, monthly revenue also increases.
#Plot with statistical results
library(ggplot2)
ggplot(data = churndata2) +
geom_smooth(mapping = aes(x = OverageMinutes, y = MonthlyRevenue, color="Brown")) + theme_classic()+
xlab("Overage minutes used by the customer") + ylab("Monthly revenue of the Telecom company ")
From the graph we can see that as Overage Minutes increased, Monthly
revenue also increased.
#install.packages("ggpubr")
library(ggpubr)
ggscatter(data = churndata2, x = "OverageMinutes", y = "MonthlyRevenue",
conf.int = TRUE, color="brown", xlab="Overage Minutes used by customer", ylab="Monthly Revenue of the Telecom company", title="Scatter plot of Overage Minutes vs Monthly Revenue for churned customers")
The scatter graph in the aforementioned section illustrates the positive
correlation between customer overage minutes consumed and the telecom
sector’s monthly income for customers who churn.
Now lets plot when there is no churn, that is when there is retention of customers.
retentiondata <- subset(Telecom_Data, Telecom_Data$Churn == "No")
head(retentiondata)
## CustomerID Churn MonthlyRevenue MonthlyMinutes TotalRecurringCharge
## 3 3000014 No 38.0 8 38
## 4 3000022 No 82.3 1312 75
## 6 3000030 No 38.0 682 52
## 7 3000038 No 31.7 26 30
## 8 3000042 No 62.1 98 66
## 9 3000046 No 35.3 24 35
## DirectorAssistedCalls OverageMinutes RoamingCalls PercChangeMinutes
## 3 0.00 0 0 -2
## 4 1.24 0 0 157
## 6 0.25 0 0 148
## 7 0.25 0 0 60
## 8 2.48 0 0 24
## 9 0.00 0 0 20
## PercChangeRevenues DroppedCalls BlockedCalls UnansweredCalls
## 3 0.0 0 0.0 0.0
## 4 8.1 52 7.7 76.0
## 6 -3.1 9 1.7 13.0
## 7 4.0 0 1.0 2.3
## 8 6.8 0 0.3 4.0
## 9 -0.3 0 0.0 1.0
## CustomerCareCalls ThreewayCalls ReceivedCalls OutboundCalls InboundCalls
## 3 0.0 0.0 0.4 0.3 0.0
## 4 4.3 1.3 200.3 370.3 147.0
## 6 0.7 0.0 42.2 6.7 0.0
## 7 0.0 0.0 0.0 0.0 0.0
## 8 4.0 0.0 0.0 3.7 0.0
## 9 0.0 0.0 2.4 4.0 1.7
## PeakCallsInOut OffPeakCallsInOut DroppedBlockedCalls CallForwardingCalls
## 3 1.3 3.7 0.0 0
## 4 555.7 303.7 59.7 0
## 6 33.3 53.0 10.7 0
## 7 1.7 1.7 1.0 0
## 8 7.7 7.3 0.3 0
## 9 9.3 1.7 0.0 0
## CallWaitingCalls MonthsInService UniqueSubs ActiveSubs ServiceArea Handsets
## 3 0.0 60 1 1 MILMIL414 1
## 4 22.7 59 2 2 PITHOM412 9
## 6 0.7 53 1 1 OKCTUL918 3
## 7 0.0 57 2 2 OKCTUL918 2
## 8 0.0 59 2 2 OKCOKC405 3
## 9 0.0 53 3 3 SANMCA210 4
## HandsetModels CurrentEquipmentDays AgeHH1 AgeHH2 ChildrenInHH
## 3 1 1812 26 26 Yes
## 4 4 458 30 0 No
## 6 2 231 28 0 No
## 7 2 601 52 58 No
## 8 3 464 46 46 Yes
## 9 3 544 36 34 Yes
## HandsetRefurbished HandsetWebCapable TruckOwner RVOwner Homeownership
## 3 No No No No Unknown
## 4 No Yes No No Known
## 6 No Yes No No Known
## 7 No Yes No No Known
## 8 No Yes No No Known
## 9 No Yes No No Known
## BuysViaMailOrder RespondsToMailOffers OptOutMailings NonUSTravel OwnsComputer
## 3 No No No No No
## 4 Yes Yes No No No
## 6 No No No Yes No
## 7 Yes Yes No Yes No
## 8 Yes Yes No No No
## 9 No No No No No
## HasCreditCard RetentionCalls RetentionOffersAccepted NewCellphoneUser
## 3 Yes 0 0 Yes
## 4 Yes 0 0 Yes
## 6 Yes 0 0 Yes
## 7 Yes 0 0 No
## 8 Yes 0 0 Yes
## 9 Yes 0 0 Yes
## NotNewCellphoneUser ReferralsMadeBySubscriber IncomeGroup OwnsMotorcycle
## 3 No 0 6 No
## 4 No 0 6 No
## 6 No 0 1 No
## 7 Yes 0 9 No
## 8 No 0 6 No
## 9 No 0 9 No
## AdjustmentsToCreditRating HandsetPrice MadeCallToRetentionTeam CreditRating
## 3 0 Unknown No 3-Good
## 4 0 10 No 4-Medium
## 6 1 30 No 3-Good
## 7 1 30 No 1-Highest
## 8 0 30 No 1-Highest
## 9 0 80 No 1-Highest
## PrizmCode Occupation MaritalStatus
## 3 Town Crafts Yes
## 4 Other Other No
## 6 Other Other Yes
## 7 Other Self Yes
## 8 Other Professional No
## 9 Other Other Yes
nrow(retentiondata)
## [1] 36336
sum(is.na(retentiondata$MonthlyRevenue))
## [1] 86
sum(is.na(retentiondata$OverageMinutes))
## [1] 86
We can see there are 86 NA values in monthly revenue and overageminutes columns when there is no churn. Lets remove those NA values
retentiondata2 <- na.omit(retentiondata)
head(retentiondata2)
## CustomerID Churn MonthlyRevenue MonthlyMinutes TotalRecurringCharge
## 3 3000014 No 38.0 8 38
## 4 3000022 No 82.3 1312 75
## 6 3000030 No 38.0 682 52
## 7 3000038 No 31.7 26 30
## 8 3000042 No 62.1 98 66
## 9 3000046 No 35.3 24 35
## DirectorAssistedCalls OverageMinutes RoamingCalls PercChangeMinutes
## 3 0.00 0 0 -2
## 4 1.24 0 0 157
## 6 0.25 0 0 148
## 7 0.25 0 0 60
## 8 2.48 0 0 24
## 9 0.00 0 0 20
## PercChangeRevenues DroppedCalls BlockedCalls UnansweredCalls
## 3 0.0 0 0.0 0.0
## 4 8.1 52 7.7 76.0
## 6 -3.1 9 1.7 13.0
## 7 4.0 0 1.0 2.3
## 8 6.8 0 0.3 4.0
## 9 -0.3 0 0.0 1.0
## CustomerCareCalls ThreewayCalls ReceivedCalls OutboundCalls InboundCalls
## 3 0.0 0.0 0.4 0.3 0.0
## 4 4.3 1.3 200.3 370.3 147.0
## 6 0.7 0.0 42.2 6.7 0.0
## 7 0.0 0.0 0.0 0.0 0.0
## 8 4.0 0.0 0.0 3.7 0.0
## 9 0.0 0.0 2.4 4.0 1.7
## PeakCallsInOut OffPeakCallsInOut DroppedBlockedCalls CallForwardingCalls
## 3 1.3 3.7 0.0 0
## 4 555.7 303.7 59.7 0
## 6 33.3 53.0 10.7 0
## 7 1.7 1.7 1.0 0
## 8 7.7 7.3 0.3 0
## 9 9.3 1.7 0.0 0
## CallWaitingCalls MonthsInService UniqueSubs ActiveSubs ServiceArea Handsets
## 3 0.0 60 1 1 MILMIL414 1
## 4 22.7 59 2 2 PITHOM412 9
## 6 0.7 53 1 1 OKCTUL918 3
## 7 0.0 57 2 2 OKCTUL918 2
## 8 0.0 59 2 2 OKCOKC405 3
## 9 0.0 53 3 3 SANMCA210 4
## HandsetModels CurrentEquipmentDays AgeHH1 AgeHH2 ChildrenInHH
## 3 1 1812 26 26 Yes
## 4 4 458 30 0 No
## 6 2 231 28 0 No
## 7 2 601 52 58 No
## 8 3 464 46 46 Yes
## 9 3 544 36 34 Yes
## HandsetRefurbished HandsetWebCapable TruckOwner RVOwner Homeownership
## 3 No No No No Unknown
## 4 No Yes No No Known
## 6 No Yes No No Known
## 7 No Yes No No Known
## 8 No Yes No No Known
## 9 No Yes No No Known
## BuysViaMailOrder RespondsToMailOffers OptOutMailings NonUSTravel OwnsComputer
## 3 No No No No No
## 4 Yes Yes No No No
## 6 No No No Yes No
## 7 Yes Yes No Yes No
## 8 Yes Yes No No No
## 9 No No No No No
## HasCreditCard RetentionCalls RetentionOffersAccepted NewCellphoneUser
## 3 Yes 0 0 Yes
## 4 Yes 0 0 Yes
## 6 Yes 0 0 Yes
## 7 Yes 0 0 No
## 8 Yes 0 0 Yes
## 9 Yes 0 0 Yes
## NotNewCellphoneUser ReferralsMadeBySubscriber IncomeGroup OwnsMotorcycle
## 3 No 0 6 No
## 4 No 0 6 No
## 6 No 0 1 No
## 7 Yes 0 9 No
## 8 No 0 6 No
## 9 No 0 9 No
## AdjustmentsToCreditRating HandsetPrice MadeCallToRetentionTeam CreditRating
## 3 0 Unknown No 3-Good
## 4 0 10 No 4-Medium
## 6 1 30 No 3-Good
## 7 1 30 No 1-Highest
## 8 0 30 No 1-Highest
## 9 0 80 No 1-Highest
## PrizmCode Occupation MaritalStatus
## 3 Town Crafts Yes
## 4 Other Other No
## 6 Other Other Yes
## 7 Other Self Yes
## 8 Other Professional No
## 9 Other Other Yes
nrow(retentiondata2)
## [1] 35519
sum(is.na(retentiondata2$MonthlyRevenue))
## [1] 0
sum(is.na(retentiondata2$OverageMinutes))
## [1] 0
We have removed all the null values. Now lets do correlation
cor.test(retentiondata2$MonthlyRevenue, retentiondata2$OverageMinutes, method='spearman')
##
## Spearman's rank correlation rho
##
## data: retentiondata2$MonthlyRevenue and retentiondata2$OverageMinutes
## S = 3e+12, p-value <2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.564
Even when there is retention the output is 0.56, we can say the that Monthly revenue varies postively as overage minutes varies.
#Plot with statistical results
library(ggplot2)
ggplot(data = retentiondata2)+
geom_smooth(mapping = aes(x = OverageMinutes, y = MonthlyRevenue), color="dark green" ) + theme_classic() + xlab("Overage minutes used by the customer") + ylab("Monthly revenue of the Telecom company ")
From the graph we can see that as Overage Minutes increases, Monthly revenue also increases.
#install.packages("ggpubr")
library(ggpubr)
ggscatter(data = retentiondata2, x = "OverageMinutes", y = "MonthlyRevenue",
conf.int = TRUE, color="dark green", xlab="Overage Minutes used by customer", ylab="Monthly Revenue of the Telecom company", title="Scatter plot of Overage Minutes vs Monthly Revenue for retention customers")
The graph shows that even for customers who are still with the company, monthly revenue rises as Overage Minutes rise. Also, we can observe that outliers are more for churned customers compared to non-churn customers
datacorr <- Telecom_Data[ , c("MonthlyRevenue","MonthlyMinutes", "TotalRecurringCharge","CustomerCareCalls","ThreewayCalls","ReceivedCalls","OutboundCalls","MonthsInService","HandsetPrice","CreditRating")]
head(datacorr)
## MonthlyRevenue MonthlyMinutes TotalRecurringCharge CustomerCareCalls
## 1 24.0 219 22 0.0
## 2 17.0 10 17 0.0
## 3 38.0 8 38 0.0
## 4 82.3 1312 75 4.3
## 5 17.1 0 17 0.0
## 6 38.0 682 52 0.7
## ThreewayCalls ReceivedCalls OutboundCalls MonthsInService HandsetPrice
## 1 0.0 97.2 0.0 61 30
## 2 0.0 0.0 0.0 58 30
## 3 0.0 0.4 0.3 60 Unknown
## 4 1.3 200.3 370.3 59 10
## 5 0.0 0.0 0.0 53 10
## 6 0.0 42.2 6.7 53 30
## CreditRating
## 1 1-Highest
## 2 4-Medium
## 3 3-Good
## 4 4-Medium
## 5 1-Highest
## 6 3-Good
nrow(datacorr)
## [1] 51047
sum(is.na(datacorr))
## [1] 468
There are 468 NA values in the subsetted dataset.
datacorr2 <- na.omit(datacorr)
nrow(datacorr2)
## [1] 50891
sum(is.na(datacorr2))
## [1] 0
After removing the NA values, we are left out with 50891 rows.
Next, we have used few numerical factors, including months of service, outbound calls, received calls, three-way calls, customer care calls, total recurring calls, and monthly minutes, and we’ve conducted correlation statistical studies to look at how they relate to one another using correlation matrix.
# load package
#install.packages("ggstatsplot")
#install.packages("ggcorrplot")
library(ggstatsplot)
library(ggcorrplot)
library(corrplot)
# correlogram
ggstatsplot::ggcorrmat(
data = datacorr2,
type = "nonparametric", # parametric for Pearson, nonparametric for Spearman's correlation
colors = c("darkred", "white", "steelblue"),
title = "Correlation matrix"
)
The above correlation graph leads us to the following conclusions:
H0: Average monthly revenue is similar across different occupations
H1: Average monthly revenue is different across different occupations
## Anova Results
one.way <- aov(MonthlyMinutes ~ Occupation, data = Telecom_Data)
#summary(one.way)
xkabledply(one.way, title = "ANOVA result summary")
P values < 0.05, Which means null hypothesis is rejected
Monthly revenue average differ for occupations